# Draw a reproducible random sample of 500 mid-length articles from the
# corpus and export it as a CSV file for manual coding.
#
# Input:  data/data_politics_asahi.RDS (must provide the columns `body`,
#         `head` and `date`, all of which are used below)
# Output: coding/sample.csv

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(quanteda)
library(lubridate)
library(stringi)

dat <- readRDS("data/data_politics_asahi.RDS")
corp <- corpus(dat, text_field = "body")

# Per-document summary; n = ndoc(corp) makes summary() cover every document
# rather than only its default number of documents, so that summ$Tokens lines
# up one-to-one with the documents in corp.
summ <- summary(corp, remove_punct = TRUE, n = ndoc(corp))

# Flag documents whose headline matches one of the listed keywords or contains
# text in full-width parentheses.
# NOTE(review): presumably recurring non-article items (schedules, notices,
# columns, indexes) -- confirm with the data owner.
ex <- stri_detect_regex(
  corp$head,
  "動静|掲示板|素粒子|お知らせ|はがき通信|インデックス|（.*）"
)

# Fixed seed so that the same 500 documents are drawn on every run.
set.seed(1234)

# Keep documents with more than 100 and fewer than 400 tokens that are not
# flagged above, then sample 500 of them without replacement. Plain function
# calls are used instead of `%>%` so that the script does not depend on a
# pipe operator being exported by one of the attached packages.
corp_eligible <- corpus_subset(
  corp,
  summ$Tokens > 100 & summ$Tokens < 400 & !ex
)
corp_short <- corpus_sample(corp_eligible, size = 500)

# Distribution of the sampled documents over years (printed when the script is
# run at top level).
table(lubridate::year(corp_short$date))

dat_short <- convert(corp_short, to = "data.frame")

# Strip line breaks so that each document's text has no embedded newlines in
# the exported file.
dat_short$text <- stri_replace_all_regex(dat_short$text, "\\n", "")

# NOTE(review): write.csv() uses the platform's native encoding and also
# writes row names; consider fileEncoding = "UTF-8" if the file is shared
# across operating systems -- left unchanged to keep the output format stable.
write.csv(dat_short, "coding/sample.csv")
